{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "51CTO课程频道：http://edu.51cto.com/lecturer/index/user_id-12330098.html<br>\n",
    "优酷频道：http://i.youku.com/sdxxqbf<br>\n",
    "微信公众号：深度学习与神经网络<br>\n",
    "Github：https://github.com/Qinbf<br>\n",
    "\n",
    "question_train_set.txt：  \n",
    "    第一列为 问题id；  \n",
    "    第二列为 title 的字符编号序列；  \n",
    "    第三列为 title 的词语编号序列；  \n",
    "    第四列为描述的字符编号序列；  \n",
    "    第五列为描述的词语标号序列。  \n",
    "    \n",
    "question_topic_train_set.txt：  \n",
    "    第一列 问题 id；  \n",
    "    第二列 话题 id。  \n",
    "\n",
    "topic_info.txt：  \n",
    "    第一列为话题 id  \n",
    "    第二列为话题的父话题 id。话题之间是有向无环图结构，一个话题可能有 0 到多个父话题；  \n",
    "    第三列为话题名字的字符编号序列；  \n",
    "    第四列为话题名字的词语编号序列；  \n",
    "    第五列为话题描述的字符编号序列；  \n",
    "    第六列为话题描述的词语编号序列。  \n",
    "\n",
    "1.title通常来说包含的信息最重要。对于question_train_set.txt文件，为了简单起见，我们只取第三列，title的词语编号序列。    \n",
    "2.对于topic_info.txt，为了简单起见，我们不考虑2,3,4,5,6列。只是简单的提取话题id，然后转为0-1998的数字（一共有1999个话题）  \n",
    "3.然后合并以上一些数据，得到最后处理后的数据。  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from tqdm import tqdm # pip install tqdm\n",
    "from six.moves import xrange"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                     0                                                  1  \\\n",
      "0  6555699376639805223  c324,c39,c40,c155,c180,c180,c181,c17,c4,c1153,...   \n",
      "1  2887834264226772863  c44,c110,c101,c286,c106,c150,c101,c892,c632,c1...   \n",
      "2 -2687466858632038806  c15,c768,c769,c1363,c650,c1218,c2361,c11,c90,c...   \n",
      "3    -5698296155734268  c473,c1528,c528,c428,c295,c15,c101,c188,c146,c...   \n",
      "4 -6719100304248915192  c190,c147,c105,c219,c220,c101,c647,c219,c220,c...   \n",
      "\n",
      "                                                   2  \\\n",
      "0  w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w...   \n",
      "1  w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w...   \n",
      "2  w875,w15450,w42394,w15863,w6,w95421,w25,w803,w...   \n",
      "3  w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14...   \n",
      "4  w380,w54,w674,w133,w54,w134,w614,w54,w929,w307...   \n",
      "\n",
      "                                                   3  \\\n",
      "0  c335,c101,c611,c189,c97,c144,c147,c101,c15,c76...   \n",
      "1  c1265,c518,c74,c131,c274,c57,c768,c769,c368,c3...   \n",
      "2  c693,c100,c279,c99,c189,c532,c101,c189,c145,c1...   \n",
      "3                                                NaN   \n",
      "4  c644,c1212,c253,c199,c431,c452,c424,c207,c2,c1...   \n",
      "\n",
      "                                                   4  \n",
      "0  w231,w54,w1681,w54,w11506,w5714,w7,w54,w744,w1...  \n",
      "1                  w12508,w1380,w72,w27045,w276,w111  \n",
      "2  w140340,w54,w48398,w54,w140341,w54,w12856,w54,...  \n",
      "3                                                NaN  \n",
      "4  w4821,w1301,w16003,w928,w1961,w2565,w50803,w11...  \n"
     ]
    }
   ],
   "source": [
    "# Load question_train_set: a tab-separated file without a header row,\n",
    "# so pandas labels the columns with integers starting at 0.\n",
    "reader = pd.read_table(\n",
    "    './ieee_zhihu_cup/question_train_set.txt',\n",
    "    sep='\\t',\n",
    "    header=None,\n",
    ")\n",
    "# Preview the first five rows.\n",
    "print(reader.iloc[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                     0                                                  1\n",
      "0  6555699376639805223            7739004195693774975,3738968195649774859\n",
      "1  2887834264226772863                               -3149765934180654494\n",
      "2 -2687466858632038806                                -760432988437306018\n",
      "3    -5698296155734268           -6758942141122113907,3195914392210930723\n",
      "4 -6719100304248915192  3804601920633030746,4797226510592237555,435133...\n"
     ]
    }
   ],
   "source": [
    "# Load question_topic_train_set (question id -> comma-separated topic ids).\n",
    "# The file is tab-separated and has no header row.\n",
    "topic_reader = pd.read_table(\n",
    "    './ieee_zhihu_cup/question_topic_train_set.txt',\n",
    "    sep='\\t',\n",
    "    header=None,\n",
    ")\n",
    "# Preview the first five rows.\n",
    "print(topic_reader.iloc[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                   0  \\\n",
      "0  w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w...   \n",
      "1  w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w...   \n",
      "2  w875,w15450,w42394,w15863,w6,w95421,w25,w803,w...   \n",
      "3  w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14...   \n",
      "4  w380,w54,w674,w133,w54,w134,w614,w54,w929,w307...   \n",
      "\n",
      "                                                   1  \n",
      "0            7739004195693774975,3738968195649774859  \n",
      "1                               -3149765934180654494  \n",
      "2                                -760432988437306018  \n",
      "3           -6758942141122113907,3195914392210930723  \n",
      "4  3804601920633030746,4797226510592237555,435133...  \n"
     ]
    }
   ],
   "source": [
    "# Merge the title word-id sequence (column 2 of question_train_set) with the\n",
    "# topic ids (column 1 of question_topic_train_set) into a two-column frame.\n",
    "\n",
    "# concat(axis=1) pairs rows by index, not by question id, so verify first that\n",
    "# both files list the same question ids in the same order.\n",
    "assert reader.iloc[:, 0].equals(topic_reader.iloc[:, 0]), (\n",
    "    'question_train_set and question_topic_train_set rows are not aligned')\n",
    "\n",
    "# .iloc replaces the .ix indexer, which was removed in pandas 1.0.\n",
    "# ignore_index=True renumbers the resulting columns to 0 and 1.\n",
    "data_topic = pd.concat(\n",
    "    [reader.iloc[:, 2], topic_reader.iloc[:, 1]], axis=1, ignore_index=True)\n",
    "print(data_topic.iloc[0:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                     0                                                  1  \\\n",
      "0   738845194850773558                               -5833678375673307423   \n",
      "1  3738968195649774859                                2027693463582123305   \n",
      "2  4738849194894773882                                1127459907694805235   \n",
      "3  7739004195693774975  2904932941037075699,1160326435131345730,725917...   \n",
      "4 -7261194805221226386                               -5833678375673307423   \n",
      "\n",
      "                  2     3                                                  4  \\\n",
      "0             c0,c1    w0  c0,c1,c2,c3,c4,c5,c6,c7,c0,c1,c8,c9,c10,c11,c1...   \n",
      "1           c39,c40   w24  c41,c42,c43,c39,c40,c4,c44,c45,c46,c47,c48,c49...   \n",
      "2    c172,c31,c0,c1  w102                                                NaN   \n",
      "3   c39,c40,c5,c173  w103  c39,c40,c23,c21,c174,c74,c5,c173,c17,c35,c39,c...   \n",
      "4  c36,c31,c45,c237  w148                                          c238,c239   \n",
      "\n",
      "                                                   5  \n",
      "0  w0,w1,w2,w3,w4,w5,w6,w7,w8,w9,w10,w11,w12,w13,...  \n",
      "1  w24,w25,w26,w27,w28,w6,w29,w30,w11,w31,w32,w33...  \n",
      "2                                                NaN  \n",
      "3  w104,w105,w11,w21,w24,w6,w106,w23,w54,w24,w107...  \n",
      "4                                          w149,w150  \n"
     ]
    }
   ],
   "source": [
    "# Load topic_info (tab-separated, no header row); column 0 holds the topic id.\n",
    "label_reader = pd.read_table(\n",
    "    './ieee_zhihu_cup/topic_info.txt',\n",
    "    sep='\\t',\n",
    "    header=None,\n",
    ")\n",
    "# Preview the first five rows.\n",
    "print(label_reader.iloc[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3\n"
     ]
    }
   ],
   "source": [
    "# Give every topic id a consecutive integer label, starting at 0, in the order\n",
    "# the ids appear in topic_info (0-1998 for the 1999 topics named in the intro).\n",
    "labels = list(label_reader.iloc[:, 0])\n",
    "my_labels = list(labels)\n",
    "\n",
    "# topic_dict maps an original topic id to its row position in topic_info.\n",
    "topic_dict = {label: i for i, label in enumerate(my_labels)}\n",
    "\n",
    "# Spot check with a known topic id.\n",
    "print(topic_dict[7739004195693774975])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████████████████████████████████████████████████████████████████| 2999967/2999967 [12:15<00:00, 4076.87it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                   0                1\n",
      "0  w305,w13549,w22752,w11,w7225,w2565,w1106,w16,w...              3,1\n",
      "1  w377,w54,w285,w57,w349,w54,w108215,w6,w47986,w...              769\n",
      "2  w875,w15450,w42394,w15863,w6,w95421,w25,w803,w...              342\n",
      "3  w8646,w2744,w1462,w9,w54,w138,w54,w50,w110,w14...          1842,12\n",
      "4  w380,w54,w674,w133,w54,w134,w614,w54,w929,w307...  155,150,110,7,6\n"
     ]
    }
   ],
   "source": [
    "def to_label_indices(topic_ids):\n",
    "    \"\"\"Convert a comma-separated string of topic ids to their topic_dict indices.\n",
    "\n",
    "    topic_ids: string such as '7739004195693774975,3738968195649774859'.\n",
    "    Returns a comma-separated string of indices in the same order, e.g. '3,1'.\n",
    "    Raises KeyError if an id is not a key of topic_dict.\n",
    "    \"\"\"\n",
    "    return ','.join(str(topic_dict[int(topic)]) for topic in topic_ids.split(','))\n",
    "\n",
    "\n",
    "# Convert the whole topic column and write it back with one column assignment.\n",
    "# The earlier per-row form, data_topic.iloc[i][1] = ..., was chained indexing:\n",
    "# it assigns into a temporary row object, which pandas does not guarantee to\n",
    "# propagate to the frame, and it built a row Series twice per iteration.\n",
    "# NOTE: column 1 is overwritten in place, so re-run the merge cell before\n",
    "# running this cell a second time.\n",
    "data_topic[1] = [to_label_indices(ids) for ids in tqdm(data_topic[1])]\n",
    "print(data_topic.iloc[:5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write the processed table to a single tab-separated file (no header, no index).\n",
    "data_topic.to_csv(\"./ieee_zhihu_cup/data_topic.txt\", header=None, index=None, sep='\\t')\n",
    "\n",
    "# Also write it as 10 consecutive blocks of up to 300000 rows each.\n",
    "# An iloc slice stops at the last row, so a final partial block is just shorter.\n",
    "# NOTE: rows past 10 * 300000 would not land in any block file.\n",
    "for block_idx in range(10):\n",
    "    block_start = block_idx * 300000\n",
    "    block_path = './ieee_zhihu_cup/data_topic_block_' + str(block_idx) + '.txt'\n",
    "    data_topic.iloc[block_start:block_start + 300000].to_csv(\n",
    "        block_path, header=None, index=None, sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
